{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from IPython.core.interactiveshell import InteractiveShell\n",
    "# Display the value of every top-level expression in a cell, not only the last one.\n",
    "InteractiveShell.ast_node_interactivity = 'all'  # default is 'last_expr'\n",
    "\n",
    "# Reload edited modules automatically before each cell is executed.\n",
    "%load_ext autoreload\n",
    "%autoreload 2\n",
    "\n",
    "import sys\n",
    "# NOTE: machine-specific absolute path; point it at your local checkout of the repo.\n",
    "sys.path.append('/Users/siyuyang/Source/repos/GitHub_MSFT/CameraTraps_data')  # append this repo to PYTHONPATH\n",
    "\n",
    "import json\n",
    "import os\n",
    "from collections import Counter, defaultdict\n",
    "from random import sample\n",
    "import math\n",
    "\n",
    "from tqdm import tqdm\n",
    "\n",
    "from data_management.megadb.schema import sequences_schema_check\n",
    "from data_management.annotations.add_bounding_boxes_to_megadb import *\n",
    "from data_management.megadb.converters.cct_to_megadb import make_cct_embedded, process_sequences, write_json"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Importing the Island Conservation 20190529 drop \n",
    "\n",
    "This dataset is different in that the class labels are at the bounding-box level. Since we only query the `class` attribute on the sequences and images when gathering training data, we add the species label from each bounding box to the `class` attribute at the image level so that future queries stay intuitive. The sequences are dummy sequences, because the original dataset contains no sequence information.\n",
    "\n",
    "The timestamps are not read from the image metadata."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Output files for the public (non-human) and private (human) MegaDB sequence entries\n",
    "path_to_output_public = '.../CameraTrap/Databases/megadb_2020/to_ingest/island_conservation_200529_megadb.json'\n",
    "path_to_output_private = '.../CameraTrap/Databases/megadb_2020/to_ingest/island_conservation_200529_private_megadb.json'\n",
    "\n",
    "# Values of the 'dataset' field for the public and the private sequence entries\n",
    "dataset_name = 'islandconservation_200529'\n",
    "dataset_private_name = 'islandconservation_200529_private'\n",
    "# Maps non-species bounding box labels to detector categories; any label not listed here\n",
    "# falls back to 'animal' where this map is applied\n",
    "label_map = {'human': 'person', 'vehicle': 'vehicle', 'empty': 'empty'}\n",
    "\n",
    "# Path to the CCT json, or a loaded json object\n",
    "# Both point to the same file here, so labels and bounding boxes come from one CCT database\n",
    "path_to_image_cct = '.../data/CameraTraps/CCT_JSONs/island_conservation.json'\n",
    "path_to_bbox_cct = '.../data/CameraTraps/CCT_JSONs/island_conservation.json'\n",
    "# At least one of the two sources must be provided\n",
    "assert not (path_to_image_cct is None and path_to_bbox_cct is None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loading image DB...\n",
      "Number of items from the image DB: 127410\n",
      "Number of images with more than 1 species: 5927 (4.65% of image DB)\n",
      "Loading bbox DB...\n",
      "Number of images added from bbox DB entries:  0\n",
      "Number of images amended:  0\n",
      "Number of items in total:  127410\n",
      "Number of images with more than one bounding box: 5927 (4.651911152970724% of all entries)\n"
     ]
    }
   ],
   "source": [
    "# Combine the image DB and the bbox DB into one collection of embedded entries\n",
    "# (the printed summary below reports how many items came from each source)\n",
    "embedded = make_cct_embedded(image_db=path_to_image_cct, bbox_db=path_to_bbox_cct)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The dataset_name is set to islandconservation_200529. Please make sure this is correct!\n",
      "Making a deep copy of docs...\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 20%|██        | 25994/127410 [00:00<00:00, 129953.86it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Putting 127410 images into sequences...\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 127410/127410 [00:00<00:00, 134468.37it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of sequences: 127410\n",
      "Checking the location field...\n",
      "Checking which fields in a CCT image entry are sequence-level...\n",
      "\n",
      "all_img_properties\n",
      "{'id', 'class', 'file', 'bbox'}\n",
      "\n",
      "img_level_properties\n",
      "{'image_id', 'file'}\n",
      "\n",
      "image-level properties that really should be sequence-level\n",
      "{'id', 'class', 'bbox'}\n",
      "\n",
      "Finished processing sequences.\n",
      "Example sequence items:\n",
      "\n",
      "{\"dataset\": \"islandconservation_200529\", \"seq_id\": \"dummy_cdfb0f599923412f8c265ca691237d7b\", \"images\": [{\"file\": \"dominicanrepublic/camara02/cam0226junio2015/dominicanrepublic_cam0226junio2015_20131026_063520_sunp0022.jpg\"}], \"class\": [\"cow\"], \"id\": \"dominicanrepublic_camara02_cam0226junio2015_dominicanrepublic_cam0226junio2015_20131026_063520_sunp0022\", \"bbox\": [{\"category\": \"cow\", \"bbox\": [0, 0.141, 0.397, 0.427]}]}\n",
      "\n",
      "[{\"dataset\": \"islandconservation_200529\", \"seq_id\": \"dummy_d3fc351fbe004817b3ca3c49c802523f\", \"images\": [{\"file\": \"dominicanrepublic/camara04/cam0425abril2015/dominicanrepublic_cam0425abril2015_20130809_045413_sunp1757.jpg\"}], \"class\": [\"empty\"], \"id\": \"dominicanrepublic_camara04_cam0425abril2015_dominicanrepublic_cam0425abril2015_20130809_045413_sunp1757\", \"bbox\": []}]\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Turn the embedded entries into sequence items tagged with dataset_name; per the output\n",
    "# below, every image ends up in its own dummy sequence\n",
    "sequences = process_sequences(embedded, dataset_name)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# this code snippet is by Vardhan\n",
    "# Since all sequences are \"dummy\" sequences, move the labels from the sequence level back to\n",
    "# the image level so that they make more sense to future queries.\n",
    "\n",
    "for entry in sequences:\n",
    "    # only the first image is used; dummy sequences are expected to hold exactly one image\n",
    "    image = entry['images'][0]\n",
    "\n",
    "    if 'class' not in entry and 'class' in image:\n",
    "        continue  # labels were already moved; keeps this cell safe to re-run\n",
    "\n",
    "    # keep the original label of each box in 'class' and map 'category' to a detector\n",
    "    # category; labels missing from label_map are treated as 'animal'\n",
    "    for box in entry['bbox']:\n",
    "        box['class'] = box['category']\n",
    "        box['category'] = label_map.get(box['category'], 'animal')\n",
    "\n",
    "    # move (not copy) the properties so that they no longer exist at the sequence level\n",
    "    image['bbox'] = entry.pop('bbox')\n",
    "    image['class'] = entry.pop('class')\n",
    "    del entry['id']  # the sequence-level id is dropped, not carried over to the image"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "122602"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "4808"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# The human images live in a different (private) container than the rest, so split the\n",
    "# sequences into a private (human) set and a public (non-human) set.\n",
    "# Each sequence is classified by the 'class' list of its first image.\n",
    "sequences_human = [s for s in sequences if 'human' in s['images'][0]['class']]\n",
    "sequences_non_human = [s for s in sequences if 'human' not in s['images'][0]['class']]\n",
    "\n",
    "len(sequences_non_human)\n",
    "len(sequences_human)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "There are exactly 4808 images in the 'human' folder in the private container, which matches the number of human sequences above, so this looks good."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The human sequences belong to the private dataset, so overwrite their 'dataset' field.\n",
    "\n",
    "for human_seq in sequences_human:\n",
    "    human_seq.update(dataset=dataset_private_name)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Verified that the sequence items meet requirements not captured by the schema.\n",
      "Verified that the sequence items conform to the schema.\n"
     ]
    }
   ],
   "source": [
    "# Validate the public (non-human) sequence items before writing them out\n",
    "sequences_schema_check.sequences_schema_check(sequences_non_human)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Verified that the sequence items meet requirements not captured by the schema.\n",
      "Verified that the sequence items conform to the schema.\n"
     ]
    }
   ],
   "source": [
    "# Validate the private (human) sequence items before writing them out\n",
    "sequences_schema_check.sequences_schema_check(sequences_human)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the public (non-human) sequence items to the file to be ingested\n",
    "write_json(path_to_output_public, sequences_non_human)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the private (human) sequence items to their own file to be ingested\n",
    "write_json(path_to_output_private, sequences_human)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Add width and height back in for the non-human set\n",
    "Using existing width and height to avoid downloading each image."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from data_management.cct_json_utils import IndexedJsonDb"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Index the original CCT json so that its image entries (which carry width and height) can be looked up\n",
    "cct_json_db = IndexedJsonDb(path_to_image_cct)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Look up the full CCT image entry (which carries width and height) by its file name.\n",
    "file_name_to_dims = {\n",
    "    cct_image['file_name']: cct_image\n",
    "    for cct_image in cct_json_db.image_id_to_image.values()\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def round_to_int(f):\n",
    "    \"\"\"Round a number to the nearest integer and return it as a built-in int.\n",
    "\n",
    "    Rounding is done by Python's built-in round(), so exact halves go to the nearest even integer.\n",
    "    \"\"\"\n",
    "    nearest = round(f)\n",
    "    return int(nearest)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Attach the image dimensions from the CCT json to each image and convert its bounding boxes\n",
    "# from relative [x, y, width, height] to absolute pixel coordinates.\n",
    "for seq in sequences_non_human:\n",
    "    for im in seq['images']:\n",
    "        if 'width' in im and 'height' in im:\n",
    "            continue  # already converted; re-running the cell must not scale the boxes twice\n",
    "\n",
    "        item = file_name_to_dims[im['file']]\n",
    "        image_width = item['width']\n",
    "        image_height = item['height']\n",
    "\n",
    "        for b in im['bbox']:\n",
    "            rel_x, rel_y, rel_w, rel_h = b['bbox']\n",
    "\n",
    "            # clamp the top-left corner into the image\n",
    "            x = min(max(round_to_int(rel_x * image_width), 0), image_width)\n",
    "            y = min(max(round_to_int(rel_y * image_height), 0), image_height)\n",
    "\n",
    "            # clamp the size against the space left after the offset, so that the box cannot\n",
    "            # extend past the right or bottom edge of the image\n",
    "            box_w = min(round_to_int(rel_w * image_width), image_width - x)\n",
    "            box_h = min(round_to_int(rel_h * image_height), image_height - y)\n",
    "\n",
    "            b['bbox'] = [x, y, box_w, box_h]\n",
    "\n",
    "        # set the dimensions last: they double as the marker that this image was converted\n",
    "        im['width'] = image_width\n",
    "        im['height'] = image_height"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the non-human sequences, now with image sizes and absolute-coordinate boxes, to the\n",
    "# file used for the conversion back to CCT. NOTE: machine-specific absolute path.\n",
    "write_json('/Users/siyuyang/Source/temp_data/CameraTrap/megadb_from_cct/island_conservation_200529/megadb_to_cct_files/island_conservation_200529_megadb_abs.json', sequences_non_human)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Command for converting from these MegaDB entries back to CCT (\"one source of truth\"):\n",
    "\n",
    "```\n",
    "python data_management/megadb/converters/megadb_to_cct.py \"islandconservation_200529\" /Users/siyuyang/Source/temp_data/CameraTrap/megadb_from_cct/island_conservation_200529/megadb_to_cct_files/island_conservation_200529_megadb_abs.json /Users/siyuyang/Source/temp_data/CameraTrap/megadb_from_cct/island_conservation_200529/megadb_to_cct_files/island_conservation_200529_megadb_to_cct.json\n",
    "```\n",
    "\n",
    "```\n",
    "...\n",
    "Final CCT DB has 154379 image entries, and 213562 annotation entries.\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:cameratraps] *",
   "language": "python",
   "name": "conda-env-cameratraps-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
